library(readxl)
library(readr)
library(CTexploreR)
library(Vennerable)
library(biomaRt)
library(tidyverse)
library(SummarizedExperiment)
library(UpSetR)
library(ComplexHeatmap)
library(circlize)
library(SingleCellExperiment)
library(org.Hs.eg.db)
library(clusterProfiler)
library(msigdbr)
library(DOSE)
library(BiocParallel)
library(patchwork)
library(Biostrings)
Gene names/synonyms required for databases cleaning
ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name",
"external_synonym", "gene_biotype",
"chromosome_name", "band", "start_position", "end_position",
"strand")
ensembl_gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
ensembl_gene_synonym <- ensembl_gene_synonym %>%
mutate(external_synonym = sub(pattern = "ORF", external_synonym,
replacement = "orf"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name")
ensembl_gene_names <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
attributes_vector <- c("external_gene_name",
"external_synonym")
gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
GTEX_data <- CTdata::GTEX_data()
## Error in readRDS(file) : error reading from connection
normal_tissues_multimapping_data <- CTdata::normal_tissues_multimapping_data()
CCLE_data <- CTdata::CCLE_data()
TCGA_TPM <- CTdata::TCGA_TPM()
DAC_treated_cells_multimapping <- CTdata::DAC_treated_cells_multimapping()
testis_sce <- CTdata::testis_sce()
scRNAseq_HPA <- CTdata::scRNAseq_HPA()
CT_mean_methylation_in_tissues <- CTdata::CT_mean_methylation_in_tissues()
####
load("../CT_genes_latest_version.rda")
CT_genes$testis_specificity <- "testis_specific"
Common figures parameters
legends_param <- list(
labels_gp = gpar(col = "black", fontsize = 6),
title_gp = gpar(col = "black", fontsize = 6),
row_names_gp = gpar(fontsize = 4),
annotation_name_side = "left")
legend_colors <- c("#5E4FA2", "#3288BD", "#66C2A5", "#ABDDA4", "#E6F598",
"#FFFFBF", "#FEE08B", "#FDAE61", "#F46D43", "#D53E4F",
"#9E0142")
chr_colors <- c("X-linked" = "deeppink", "Not X" = "royalblue1")
meth_colors <- c("Methylation" = "lightgreen", "Not methylation" = "gray")
CT lists from other databases have been checked (using GTEx and our
GTEx_expression() funtion and GeneCards) in order to remove
duplicated gene names or deprecated ones and allow comparison between
databases.
Online list copied in a csv file, several lists exist so we combined them.
We checked gene names that were a concatenation of two genes (choice using biomaRt synonyms to get the official one), checked which ones had the right names, removed duplicated genes, verified lost genes and added back those that should be there.
CTdatabase <- read_delim("data/CTdatabase1.csv", delim = ";",
escape_double = FALSE, trim_ws = TRUE)
colnames(CTdatabase) <- c("Family", "Gene_Name", "Chromosomal_localization",
"CT_identifier")
CTdatabase_bis <- read_csv2("data/CTdatabase2.csv")
CTdatabase <- left_join(CTdatabase, CTdatabase_bis,
by = c("Gene_Name" = "Gene_Symbol"))
CTdatabase_single <- CTdatabase %>%
mutate(Gene_Name = sub(pattern = "/.*$", Gene_Name, replacement = ""))
CTdatabase_single <- CTdatabase_single %>%
mutate(Gene_Name = sub(pattern = ",.*$", Gene_Name, replacement = ""))
CTdatabase_official_names <-
unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id,
external_gene_name)) %>%
filter(external_gene_name %in% CTdatabase_single$Gene_Name) %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA)
CTdatabase_synonym <-
ensembl_gene_synonym %>%
filter(external_synonym %in% CTdatabase_single$Gene_Name) %>%
mutate(Gene_Name = external_synonym) %>%
dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym)
CTdatabase_cleaned <-
rbind(CTdatabase_official_names, CTdatabase_synonym) %>%
left_join(CTdatabase_single)
duplicated_genes <- CTdatabase_cleaned$Gene_Name[duplicated(CTdatabase_cleaned$Gene_Name)]
bad_ids <- ensembl_gene_synonym %>%
filter(external_gene_name %in% duplicated_genes | external_synonym %in% duplicated_genes) %>%
filter(chromosome_name %in% grep(pattern = "H", x = chromosome_name, value = TRUE)) %>%
pull(ensembl_gene_id)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
dplyr::filter(!ensembl_gene_id %in% bad_ids)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
filter(!ensembl_gene_id == "ENSG00000052126")
CTdatabase_cleaned <- CTdatabase_cleaned %>%
filter(!(ensembl_gene_id == "ENSG00000183305" & Gene_Name == "MAGEA2"))
CTdatabase_cleaned <- CTdatabase_cleaned %>%
filter(!ensembl_gene_id == "ENSG00000204648")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CSAG3B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CSAG2", "external_synonym"] <- "CSAG3B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT45A4")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CT45A3", "external_synonym"] <- "CT45A4"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "LAGE-1b")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAG2", "external_synonym"] <- "LAGE-1b"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT16.2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "PAGE5", "external_synonym"] <- "CT16.2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXB2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXB1", "external_synonym"] <- "SPANXB2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXE")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXD", "external_synonym"] <- "SPANXE"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1C")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1D")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1E")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE2B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "XAGE2", "external_synonym"] <- "XAGE2B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CTAGE-2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAGE1", "external_synonym"] <- "CTAGE-2"
CTdatabase_cleaned <- ensembl_gene_synonym %>%
mutate(Gene_Name = external_synonym) %>%
filter(external_synonym == "CXorf61") %>%
dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "Cxorf61",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name)) %>%
filter(external_gene_name == "CCNA1") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "cyclin A1",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "GOLGA6L2") %>%
filter(ensembl_gene_id == "ENSG00000174450") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "GOLGAGL2 FA",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "LYPD6B") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC130576",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "CT62") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC196993",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "CT75") %>%
filter(ensembl_gene_id == "ENSG00000291155") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC440934",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "LINC01192") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC647107",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "TSPY1") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC728137",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
filter(external_gene_name == "SSX2B") %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA) %>%
cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "SSX2b",
c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>%
rbind(CTdatabase_cleaned)
Excel file coming from supplemental data.
Jamin_core_CT <- read_excel("data/Jamin_core_CT.xlsx")
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1211", "Gene"] <- "CRACD"
Jamin_core_CT[Jamin_core_CT$Gene == "CXorf67", "Gene"] <- "EZHIP"
Excel file coming from supplemental data.
Wang_CT <- read_excel("data/Wang_Suppl_Data_3.xlsx",
sheet = "Supplementary Data 3B", skip = 1)
colnames(Wang_CT)[1] <- "ensembl_gene_id"
Wang_CT <- ensembl_gene_names %>%
filter(ensembl_gene_id %in% Wang_CT$ensembl_gene_id) %>%
right_join(Wang_CT)
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000181013", "external_gene_name"] <- "C17orf47"
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000204293", "external_gene_name"] <- "OR8B2"
Wang_CT[Wang_CT$external_gene_name == "", "external_gene_name"] <- "RNASE11"
Wang_CT[Wang_CT$external_gene_name == "CHCT1", "external_gene_name"] <- "C17orf64"
Wang_CT[Wang_CT$external_gene_name == "PRSS40A", "external_gene_name"] <- "TISP43"
Wang_CT[Wang_CT$external_gene_name == "TEX56P", "external_gene_name"] <- "C6orf201"
Wang_CT[Wang_CT$external_gene_name == "SLC25A51P4", "external_gene_name"] <- "RP11-113D6.10"
Wang_CT[Wang_CT$external_gene_name == "TCP10L3", "external_gene_name"] <- "TCP10"
Wang_CT[Wang_CT$external_gene_name == "SCAND3", "external_gene_name"] <- "ZBED9"
Carter_CT_list <- read_table("data/Carter_CT_list.txt", skip = 1)
Carter_CT <- Carter_CT_list[Carter_CT_list$CT_Expression,]
Carter_CT[Carter_CT$Gene == "ENSG00000261649", "Gene_Name"] <- "GOLGA6L7"
Carter_CT[Carter_CT$Gene == "ENSG00000239620", "Gene_Name"] <- "PRR20G"
Carter_CT[Carter_CT$Gene == "ENSG00000168148", "Gene_Name"] <- "H3-4"
Carter_CT[Carter_CT$Gene == "ENSG00000204296", "Gene_Name"] <- "TSBP1"
Carter_CT[Carter_CT$Gene == "ENSG00000180219", "Gene_Name"] <- "GARIN6"
Carter_CT[Carter_CT$Gene == "ENSG00000172717", "Gene_Name"] <- "GARIN2"
Carter_CT[Carter_CT$Gene == "ENSG00000174015", "Gene_Name"] <- "CBY2"
Carter_CT[Carter_CT$Gene == "ENSG00000224960", "Gene_Name"] <- "PPP4R3C"
Excel file from supplemental data.
Bruggeman_data <- read_excel("data/Bruggeman_suppl_data.xlsx", skip = 1,
sheet = "1D")
Bruggeman_official_names <- gene_synonym %>%
dplyr::select(external_gene_name) %>%
unique() %>%
filter(external_gene_name %in% Bruggeman_data$Gene) %>%
mutate(Gene_Name = external_gene_name) %>%
mutate(external_synonym = NA)
Bruggeman_synonym <- gene_synonym %>%
filter(external_synonym %in% Bruggeman_data$Gene) %>%
mutate(Gene_Name = external_synonym) %>%
dplyr::select(external_gene_name, Gene_Name, external_synonym)
Bruggeman_synonym <- Bruggeman_synonym[-which(Bruggeman_synonym$Gene_Name %in%
Bruggeman_official_names$Gene_Name),]
Bruggeman_CT <- rbind(Bruggeman_official_names, Bruggeman_synonym)
lost <- Bruggeman_data[which(!Bruggeman_data$Gene %in% c(Bruggeman_CT$Gene_Name)), "Gene"]
colnames(lost) <- "external_gene_name"
lost$Gene_Name <- rep(NA, nrow(lost))
lost$external_synonym <- rep(NA, nrow(lost))
lost[lost$external_gene_name == "C21orf59", "Gene_Name"] <- "CFAP298"
lost[lost$external_gene_name == "C11orf57", "Gene_Name"] <- "NKAPD1"
lost[lost$external_gene_name == "C7orf55", "Gene_Name"] <- "FMC1"
lost[lost$external_gene_name == "C10orf12", "Gene_Name"] <- "LCOR"
lost[lost$external_gene_name == "RPL19P12", "Gene_Name"] <- "RPL19P12"
lost[lost$external_gene_name == "C16orf59", "Gene_Name"] <- "TEDC2"
lost[lost$external_gene_name == "TTTY15", "Gene_Name"] <- "USP9Y"
lost[lost$external_gene_name == "C17orf53", "Gene_Name"] <- "HROB"
lost[lost$external_gene_name == "C1orf112", "Gene_Name"] <- "FIRRM"
lost[lost$external_gene_name == "C12orf66", "Gene_Name"] <- "KICS2"
lost[lost$external_gene_name == "C9orf84", "Gene_Name"] <- "SHOC1"
lost[lost$external_gene_name == "C10orf25", "Gene_Name"] <- "ZNF22-AS1"
lost[lost$external_gene_name == "C20orf197", "Gene_Name"] <- "LINC02910"
lost[lost$external_gene_name == "C3orf67", "Gene_Name"] <- "CFAP20DC"
lost[lost$external_gene_name == "C8orf37", "Gene_Name"] <- "CFAP418"
lost[lost$external_gene_name == "C22orf34", "Gene_Name"] <- "MIR3667HG"
Bruggeman_CT <- rbind(Bruggeman_CT, lost)
missing_Bruggeman <- c("BMS1P4", "ADAM6", "ANXA2P3", "ARHGAP11B", "DPY19L2P2",
"HLA-L", "PA2G4P4", "PIPSL", "PRKY", "YBX3P1",
"RPL23AP53", "UQCRBP1", "RPL23P8", "MRS2P2", "PIN4P1",
"SLC6A10P", "GUSBP2", "PPIEL", "LRRC37BP1", "MSL3P1",
"PLEKHA8P1", "STAG3L1", "TCAM1P", "ZNF702P", "ZNF815P",
"ATP6AP1L", "RPL21P44", "SEC14L1P1", "ZNF876P",
"RPLP0P2", "FAM86JP", "FAM175A", "LACE1", "ATP5EP2",
"WDR92", "TCTE3", "METTL20", "KIAA2022", "ZNRD1-AS1",
"SGOL1", "FAM35DP", "MTL5", "TMEM14E", "MLLT4-AS1",
"CCDC173", "KIAA1524", "WDR78", "LINC00476", "LYRM5",
"HILS1", "CASC5", "KIAA1919", "CTAGE5", "FAM188B",
"TMEM194B", "FAM122C", "PPP1R2P3", "KIAA0391", "SGOL2",
"FAM19A3", "ZNF788", "RPL19P12", "FIRRM")
external_names_to_keep <- gene_synonym %>%
filter(external_synonym %in% missing_Bruggeman) %>%
filter(!external_gene_name %in% c("ATP5F1EP2", "POLR1HASP", "SHLD2P3",
"TMEM14EP", "H1-9P", "ZNF788P")) %>%
mutate(Gene_Name = external_gene_name)
Bruggeman_CT[Bruggeman_CT$external_synonym %in%
external_names_to_keep$external_synonym,
"Gene_Name"] <- external_names_to_keep$Gene_Name
Bruggeman_CT <- Bruggeman_CT %>%
dplyr::select(Gene_Name)
To characterise the differences between our database and other, we need the category we created in CTexploreR.
Hereunder is what we used for our selection pipeline (coming from
make_CT_list.R in CTdata), mainly how we
created the data.
# GTEX data with the tissue specificity category determined
all_genes <- as_tibble(rowData(GTEX_data), rownames = "ensembl_gene_id")
all_genes$TPM_testis <- assay(GTEX_data)[, "Testis"]
# Add multimapping_analysis assessing testis-specificity of genes "lowly_expressed"
all_genes <- all_genes %>%
left_join(as_tibble(rowData(normal_tissues_multimapping_data),
rownames = "ensembl_gene_id"))
# Summarise both specificity
all_genes <- all_genes %>%
mutate(testis_specificity = case_when(
GTEX_category == "testis_specific" |
multimapping_analysis == "testis_specific" ~ "testis_specific",
GTEX_category == "testis_preferential" ~ "testis_preferential"))
# Add testis scRNA seq highest expressed cell type
all_genes <- all_genes %>%
left_join(as_tibble(rowData(testis_sce)) %>%
dplyr::select(external_gene_name, testis_cell_type))
# Add higher in somatic scRNAseq data of normal tissues from the Human Protein Atlas
all_genes <- all_genes %>%
left_join(as_tibble(rowData(scRNAseq_HPA), rownames = "ensembl_gene_id") %>%
dplyr::select(ensembl_gene_id, external_gene_name,
Higher_in_somatic_cell_type))
# CCLE database analysis added
all_genes <- all_genes %>%
left_join(as_tibble(rowData(CCLE_data), rownames = "ensembl_gene_id"))
all_genes[is.na(all_genes$CCLE_category), "CCLE_category"] <- "not_in_CCLE"
# TCGA database analysis added
all_genes <- all_genes %>%
left_join(as_tibble(rowData(TCGA_TPM), rownames = "ensembl_gene_id") %>%
dplyr::select(ensembl_gene_id, external_gene_name, percent_pos_tum,
percent_neg_tum, max_TPM_in_TCGA, TCGA_category))
all_genes[all_genes$lowly_expressed_in_GTEX == TRUE &
all_genes$multimapping_analysis == "testis_specific",
"TCGA_category"] <- "multimapping_issue"
# Add DAC analysis
induced <- as_tibble(rowData(DAC_treated_cells_multimapping),
rownames = "ensembl_gene_id") %>%
filter(induced == TRUE) %>%
pull(external_gene_name)
all_genes <- all_genes %>%
mutate(DAC_induced = case_when(external_gene_name %in% induced ~ TRUE,
!external_gene_name %in% induced ~ FALSE))
# Add the most biologically relevant transcript (canonical and coordinates)
ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id",
"external_gene_name",
"ensembl_transcript_id",
"external_transcript_name",
"chromosome_name",
"strand",
"transcript_start",
"transcript_end",
"transcription_start_site",
"transcript_length",
"transcript_biotype",
"transcript_is_canonical")
transcripts_infos <- as_tibble(biomaRt::getBM(attributes = attributes_vector,
mart = ensembl))
canonical_transcripts <- transcripts_infos %>%
filter(transcript_is_canonical == 1) %>%
filter(chromosome_name %in% c(1:22, "X", "Y", "MT")) %>%
filter(transcript_biotype == "protein_coding" | transcript_biotype == "lncRNA")
all_genes <- all_genes %>%
left_join(canonical_transcripts %>%
dplyr::select(ensembl_gene_id,
external_transcript_name, ensembl_transcript_id,
chromosome_name, strand, transcript_start,
transcript_end, transcription_start_site,
transcript_length, transcript_biotype))
From there, we filtered based on the testis_specificity (“testis_specific” or “testis_preferential”), testis_cell_type (not “Macrophage”, “Endothelial”, “Myoid”, “Sertoli”, “Leydig”), Higher_in_somatic_cell_type (TRUE), CCLE_category (“activated”) and TCGA_category (“activated” or “multimapping_issue”) to have our CT genes. Then the most relevant transcript was validated manually (and check that there is a transcript activated in testis and the same is activated in tumor).
CTdatabase_ours <- Venn(list(CTdatabase = CTdatabase_cleaned$external_gene_name,
CTexploreR = CT_genes$external_gene_name))
gp <- VennThemes(compute.Venn(CTdatabase_ours))
gp[["Face"]][["11"]]$fill <- "mistyrose"
gp[["Face"]][["01"]]$fill <- "darkseagreen1"
gp[["Face"]][["10"]]$fill <- "lightsteelblue1"
gp[["Set"]][["Set1"]]$col <- "paleturquoise4"
gp[["Set"]][["Set2"]]$col <- "darkseagreen4"
gp[["SetText"]][["Set1"]]$col <- "paleturquoise4"
gp[["SetText"]][["Set2"]]$col <- "darkseagreen4"
plot(CTdatabase_ours, gp = gp)
We find 29.0322581 % of CTdatabase in CTexploreR, which is 41.1428571 % of our database.
Lost genes analysis
CTdatabase_lost <- all_genes %>%
filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]])
# 12 Genes are lost because not in any database
CTdatabase_lost[is.na(CTdatabase_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(CTdatabase_lost$testis_specificity)
##
## not_specific testis_preferential testis_specific
## 85 28 49
table(CTdatabase_lost$testis_cell_type)
##
## Early_spermatocyte Elongated_spermatid Late_spermatocyte Leydig
## 18 15 17 2
## Round_spermatid Sertoli Sperm1 Sperm2
## 44 2 6 10
## Spermatogonia SSC
## 6 16
table(CTdatabase_lost$Higher_in_somatic_cell_type)
##
## FALSE TRUE
## 116 28
table(CTdatabase_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 68 46 13 35
table(CTdatabase_lost$CCLE_category)
##
## activated leaky not_activated
## 80 32 50
table(CTdatabase_lost$TCGA_category, CTdatabase_lost$CCLE_category)
##
## activated leaky not_activated
## activated 50 1 17
## leaky 13 31 2
## multimapping_issue 6 0 7
## not_activated 11 0 24
69.7530864% of these genes are not testis specific.
69.1358025 % are not properly activated in tumors and/or cancer cell lines.
core_ours <- Venn(list(Jamin = Jamin_core_CT$Gene,
CTexploreR = CT_genes$external_gene_name))
Wang_ours <- Venn(list(Wang = Wang_CT$external_gene_name,
CTexploreR = CT_genes$external_gene_name))
Carter_ours <- Venn(list(Carter_CT = Carter_CT$Gene_Name,
CTexploreR = CT_genes$external_gene_name))
Bruggeman_ours <- Venn(list(Bruggeman = Bruggeman_CT$Gene_Name,
CTexploreR = CT_genes$external_gene_name))
gene_list <- list(CTexploreR = CT_genes$external_gene_name,
Carter = Carter_CT$Gene_Name,
Jamin = Jamin_core_CT$Gene,
CTatlas = Wang_CT$external_gene_name,
Bruggeman = Bruggeman_CT$Gene_Name)
upset_omics <- fromList(gene_list)
upset(upset_omics)
4 in all, 59 in at least 3 databases
Lost genes analysis
plot(core_ours, gp = gp)
Jamin_lost <- all_genes %>%
filter(external_gene_name %in% core_ours@IntersectionSets[["10"]])
Jamin_lost[is.na(Jamin_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Jamin_lost$testis_specificity)
##
## not_specific testis_preferential testis_specific
## 78 8 10
table(Jamin_lost$testis_cell_type)
##
## Early_spermatocyte Elongated_spermatid Endothelial Late_spermatocyte
## 10 10 1 15
## Macrophage Round_spermatid Sertoli Sperm1
## 1 23 2 4
## Sperm2 Spermatogonia SSC
## 4 8 4
table(Jamin_lost$Higher_in_somatic_cell_type)
##
## FALSE TRUE
## 68 22
table(Jamin_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 20 56 6 14
table(Jamin_lost$CCLE_category)
##
## activated leaky not_activated
## 50 42 4
We find 23.2 % of CTdatabase in CTexploreR, which is 16.5714286 % of our database.
89.5833333% of these genes are not testis specific.
80.2083333 % are not properly activated in tumors and/or cancer cell lines.
plot(Wang_ours, gp = gp)
Wang_lost <- all_genes %>%
filter(external_gene_name %in% Wang_ours@IntersectionSets[["10"]])
Wang_lost[is.na(Wang_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Wang_lost$testis_specificity)
##
## not_specific testis_preferential testis_specific
## 461 175 264
table(Wang_lost$testis_cell_type)
##
## Early_spermatocyte Elongated_spermatid Endothelial Late_spermatocyte
## 78 142 1 85
## Leydig Myoid Round_spermatid Sertoli
## 1 1 334 18
## Sperm1 Sperm2 Spermatogonia SSC
## 43 76 26 52
table(Wang_lost$Higher_in_somatic_cell_type)
##
## FALSE TRUE
## 805 66
table(Wang_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 382 296 6 216
table(Wang_lost$CCLE_category)
##
## activated leaky not_activated
## 337 203 360
We find 8.9303238 % of CTdatabase in CTexploreR, which is 52 % of our database.
70.6666667% of these genes are not testis specific.
76.3333333 % are not properly activated in tumors and/or cancer cell lines.
plot(Carter_ours, gp = gp)
Carter_lost <- all_genes %>%
filter(external_gene_name %in% Carter_ours@IntersectionSets[["10"]])
Carter_lost[is.na(Carter_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Carter_lost$testis_specificity)
##
## not_specific testis_preferential testis_specific
## 1 11 50
table(Carter_lost$testis_cell_type)
##
## Early_spermatocyte Elongated_spermatid Late_spermatocyte Round_spermatid
## 4 17 3 24
## Sertoli Sperm1 Spermatogonia SSC
## 2 2 2 5
table(Carter_lost$Higher_in_somatic_cell_type)
##
## FALSE TRUE
## 59 3
table(Carter_lost$TCGA_category)
##
## activated leaky not_activated
## 34 9 19
table(Carter_lost$CCLE_category)
##
## activated leaky not_activated
## 21 3 38
We find 38.8349515 % of CTdatabase in CTexploreR, which is 22.8571429 % of our database.
19.3548387% of these genes are not testis specific.
77.4193548 % are not properly activated in tumors and/or cancer cell lines.
plot(Bruggeman_ours, gp = gp)
Bruggeman_lost <- all_genes %>%
filter(external_gene_name %in% Bruggeman_ours@IntersectionSets[["10"]])
Bruggeman_lost[is.na(Bruggeman_lost$testis_specificity), ]$testis_specificity <- "not_specific"
table(Bruggeman_lost$testis_specificity)
##
## not_specific testis_preferential testis_specific
## 653 34 17
table(Bruggeman_lost$testis_cell_type)
##
## Early_spermatocyte Elongated_spermatid Endothelial Late_spermatocyte
## 105 86 11 79
## Leydig Macrophage Myoid Round_spermatid
## 3 9 13 161
## Sertoli Sperm1 Sperm2 Spermatogonia
## 28 7 14 69
## SSC
## 91
table(Bruggeman_lost$Higher_in_somatic_cell_type)
##
## FALSE TRUE
## 344 334
table(Bruggeman_lost$TCGA_category)
##
## activated leaky multimapping_issue not_activated
## 156 524 1 23
table(Bruggeman_lost$CCLE_category)
##
## activated leaky not_activated
## 242 432 30
We find 1.7195767 % of CTdatabase in CTexploreR, which is 7.4285714 % of our database.
97.5852273% of these genes are not testis specific.
81.1079545 % are not properly activated in tumors and/or cancer cell lines.
common <- unique(c(core_ours@IntersectionSets[["11"]],
CTdatabase_ours@IntersectionSets[["11"]],
Wang_ours@IntersectionSets[["11"]],
Carter_ours@IntersectionSets[["11"]],
Bruggeman_ours@IntersectionSets[["11"]]))
length(common)
## [1] 118
length(common)/dim(CT_genes)[1] * 100
## [1] 67.42857
lost_list <- unique(c(core_ours@IntersectionSets[["10"]],
CTdatabase_ours@IntersectionSets[["10"]],
Wang_ours@IntersectionSets[["10"]],
Carter_ours@IntersectionSets[["10"]],
Bruggeman_ours@IntersectionSets[["10"]]))
lost <- all_genes %>%
filter(external_gene_name %in% lost_list)
not_specific <- filter(lost, is.na(testis_specificity))
GTEX_expression(not_specific$external_gene_name, units = "log_TPM")
somatic_testis <- filter(lost, testis_cell_type %in% c( "Macrophage",
"Endothelial", "Myoid",
"Sertoli", "Leydig"))
testis_expression(somatic_testis$external_gene_name, cells = "all")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
somatic_tissue <- filter(lost, Higher_in_somatic_cell_type == TRUE)
HPA_cell_type_expression(somatic_tissue$external_gene_name)
not_TCGA_activated <- filter(lost, TCGA_category != "activated" &
TCGA_category != "multimapping_issue")
TCGA_expression(not_TCGA_activated$external_gene_name,
tumor = "all",
units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
not_CCLE_activated <- filter(lost, CCLE_category != "activated")
CCLE_expression(not_CCLE_activated$external_gene_name,
type = c("lung", "skin", "colorectal",
"gastric", "breast", "head_and_neck"),
units = "log_TPM")
transcript_prob <- lost %>%
filter(testis_specificity == "testis_specific" |
testis_specificity == "testis_preferential") %>%
filter(TCGA_category == "activated" | TCGA_category == "multimapping_issue") %>%
filter(CCLE_category == "activated") %>%
dim()
We have lost 1694 genes in total. Among them, 66.5289256% are not considered testis specific, 5.1948052% are expressed in testis somatic cells, 25.501771% are expressed in somatic cells, 62.8689492% are not activated in TCGA samples, 60.9799292% are not activated in CCLE cell lines and 6.9067296% is lost due to transcripts problems.
What about new genes in CTexploreR
new <- CT_genes %>%
filter(!external_gene_name%in%common)
new
table(new$testis_specificity)
##
## testis_specific
## 57
table(new$X_linked, new$regulated_by_methylation)
##
## FALSE TRUE
## FALSE 25 21
## TRUE 0 11
TCGA_expression(tumor = "all", genes = new$external_gene_name,
units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
TCGA_expression(tumor = "all",
genes = filter(new, X_linked & regulated_by_methylation)$external_gene_name,
units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
There are 57 new CT genes in CTexploreR. These are mainly testis
specific and on autosomes. Regulation by methylation is not the majority
of them. This makes sens as other databases have been focused on
regulation by methylation, so as here we don’t segregate with that we
can find these. There is only 11 new “major” CT that are on the X
chromosome and regulated by methylation. CT45 are not that new.
Expression in tumours doesn’t strike that much.
table(CT_genes$testis_specificity)
##
## testis_specific
## 175
table(CT_genes$transcript_biotype)
##
## lncRNA protein_coding
## 34 141
Most genes are testis specific (100%). Most genes are mainly protein coding genes (80.5714286%).
In CTexploreR, genes have been characterised as regulated by methylation or not.
table(CT_genes$X_linked)
##
## FALSE TRUE
## 97 78
table(CT_genes$regulated_by_methylation)
##
## FALSE TRUE
## 50 125
table(CT_genes$X_linked, CT_genes$testis_specificity)
##
## testis_specific
## FALSE 97
## TRUE 78
table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
##
## FALSE TRUE
## FALSE 46 51
## TRUE 4 74
Genes are enriched on the X chromosome (44.5714286%). Also, 125 genes have been flagged as regulated by methylation (71.4285714%). It is interesting to study the link between these two characteristics.
On the chromosome X, there is a clear enrichment of CT genes regulated by methylation (74/78 chrX genes or 74/125 genes regulated by methylation).
Let’s check that with a statistical test, I here want to see if, on there is an enrichment of genes regulated by methylation on the X chromosome. I need to do a Pearson Chi square test (to know if observed proportion differ from expected proportion). It is a statistical method to determine if two categorical variables have a significant correlation between them. I can directly put a matrix (my table) in the function
chisq.test(table(CT_genes$X_linked, CT_genes$regulated_by_methylation))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
## X-squared = 35.852, df = 1, p-value = 2.129e-09
CT_genes$chr_factor <- factor(CT_genes$chr,
levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "11", "12", "13", "14", "15", "16", "17",
"18", "19", "20", "21", "22", "X", "Y"))
CT_genes %>%
mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
"Regulated by methylation",
"Not regulated by methylation")) %>%
ggplot(aes(x = chr_factor, fill = X_linked)) +
geom_bar(stat = 'count') +
scale_fill_manual(values = c("royalblue1", "deeppink")) +
facet_wrap(~ regulated_by_methylation, ncol = 2) +
theme_bw() +
xlab("Chromosome") +
ylab("Number of genes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none",
axis.title = element_text(size = 10, color = "gray10"))
There is indeed a significative link between regulation by methylation and being on the X chromosome. There is thus an enrichment of CT genes regulated by methylation on the X chromosome (and inversely).
Using CTexploreR functions, we can explore all CT genes or focus on some potential targets.
For these heatmaps, the code comes from the function but has been copies to add some annotations.
chr_mat <- as.matrix(CT_genes$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes$external_gene_name
row_ha_chr <- rowAnnotation(chr_factor = chr_mat,
annotation_legend_param = legends_param,
simple_anno_size = unit(0.5, "cm"),
col = list(chr_factor = chr_colors),
annotation_name_gp = gpar(fontsize = 8),
annotation_name_side = "top")
regulation_mat <- as.matrix(CT_genes$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
"Not methylation")
rownames(regulation_mat) <- CT_genes$external_gene_name
row_ha_reg <- rowAnnotation(regulation = regulation_mat,
annotation_legend_param = legends_param,
simple_anno_size = unit(0.5, "cm"),
col = list(regulation = meth_colors),
annotation_name_gp = gpar(fontsize = 8),
annotation_name_side = "top")
left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))
split <- data.frame(CT_genes$regulated_by_methylation, CT_genes$X_linked)
database <- TCGA_TPM
database$tumor <- sub(pattern = 'TCGA-', x = database$project_id, '')
database$type <- "Tumor"
database$type[database$shortLetterCode == "NT"] <- "Peritumoral"
database <- database[, order(database$tumor, database$type)]
genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]
database <- database[, database$type == "Tumor"]
column_ha_tumor <- HeatmapAnnotation(
Tumor = database$tumor,
border = TRUE,
col = list(Tumor = c("BRCA" = "midnightblue", "COAD" = "darkorchid2",
"ESCA" = "gold", "HNSC" = "deeppink2",
"LUAD" = "seagreen", "LUSC" = "seagreen3",
"SKCM" = "red3")),
annotation_name_gp = gpar(fontsize = 8),
annotation_legend_param = legends_param)
split_by <- factor(database$tumor)
col_annot <- column_ha_tumor
mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name
Heatmap(mat[, , drop = FALSE],
name = "logTPM",
column_title = paste0("Expression in TCGA samples (all)"),
column_split = split_by,
row_split = split,
row_title_gp = gpar(fontsize = 0),
col = colorRamp2(seq(0, max(mat), length = 11),
legend_colors),
clustering_method_rows = "ward.D",
clustering_method_columns = "ward.D",
cluster_rows = TRUE,
show_column_names = FALSE,
cluster_columns = TRUE,
show_column_dend = FALSE,
show_row_dend = FALSE,
row_names_gp = gpar(fontsize = 4),
heatmap_legend_param = legends_param,
top_annotation = col_annot,
left_annotation = left_annot)
database <- CCLE_data
database$type <- tolower(database$type)
genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]
mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name
df_col <- data.frame("cell_line" = colData(database)$cell_line_name,
"type" = as.factor(colData(database)$type))
rownames(df_col) <- rownames(colData(database))
df_col <- df_col[order(df_col$type), ]
column_ha_type <- HeatmapAnnotation(
type = df_col$type,
border = TRUE,
annotation_name_gp = gpar(fontsize = 8),
annotation_legend_param = legends_param,
col = list(type = c("lung" = "seagreen3", "skin" = "red3",
"bile_duct" = "mediumpurple1", "bladder" = "mistyrose2",
"colorectal" = "plum", "lymphoma" = "steelblue1",
"uterine" = "darkorange4", "myeloma" = "turquoise3",
"kidney" = "thistle4",
"pancreatic" = "darkmagenta", "brain" = "palegreen2",
"gastric" = "wheat3", "breast" = "midnightblue",
"bone" = "sienna1", "head_and_neck" = "deeppink2",
"ovarian" = "tan3", "sarcoma" = "lightcoral",
"leukemia" = "steelblue4", "esophageal"= "khaki",
"neuroblastoma" = "olivedrab1")))
Heatmap(mat[, rownames(df_col), drop = FALSE],
name = "logTPM",
column_title = "Gene Expression in tumor cell lines (CCLE)",
column_split = factor(df_col$type),
row_split = split,
row_title_gp = gpar(fontsize = 0),
col = colorRamp2(seq(0, max(mat), length = 11),
legend_colors),
clustering_method_rows = "ward.D",
clustering_method_columns = "ward.D",
cluster_rows = TRUE,
show_row_dend = FALSE,
show_column_names = FALSE,
cluster_columns = TRUE,
show_column_dend = FALSE,
row_names_gp = gpar(fontsize = 4),
heatmap_legend_param = legends_param,
top_annotation = c(column_ha_type),
left_annotation = left_annot)
Next graph was removed from paper
genes <- CT_genes$external_gene_name
database <- CT_mean_methylation_in_tissues[rownames(CT_mean_methylation_in_tissues) %in% genes]
mat <- na.omit(assay(database))
clustering_option <- TRUE
row_ha_chr_meth <- rowAnnotation(chr = chr_mat[rownames(mat),],
annotation_legend_param = legends_param,
simple_anno_size = unit(0.5, "cm"),
col = list(chr = chr_colors),
annotation_name_gp = gpar(fontsize = 8),
annotation_name_side = "top")
row_ha_reg_meth <- rowAnnotation(regulation = regulation_mat[rownames(mat),],
annotation_legend_param = legends_param,
simple_anno_size = unit(0.5, "cm"),
col = list(regulation = meth_colors),
annotation_name_gp = gpar(fontsize = 8),
annotation_name_side = "top")
left_annot_meth <- c(row_ha_chr_meth, row_ha_reg_meth, gap = unit(1, "mm"))
split_meth <- data.frame(filter(CT_genes, external_gene_name %in% rownames(mat))$regulated_by_methylation,
filter(CT_genes, external_gene_name %in% rownames(mat))$X_linked)
Heatmap(mat,
column_title = 'Promoter mean methylation level by tissue',
name = 'Meth',
col = colorRamp2(c(1:100),
colorRampPalette(c("moccasin","dodgerblue4"))(100)),
na_col = "gray80",
cluster_rows = clustering_option,
cluster_columns = FALSE,
row_split = split_meth,
row_title_gp = gpar(fontsize = 0),
show_row_names = TRUE,
show_heatmap_legend = TRUE,
show_row_dend = FALSE,
row_names_gp = gpar(fontsize = 3),
column_names_gp = gpar(fontsize = 8),
column_names_side = "bottom",
row_names_side = "right",
left_annotation = left_annot_meth)
MAGE_genes <- filter(CT_genes, family == "MAGE")$external_gene_name
TCGA_expression(tumor = "all",
genes = MAGE_genes,
units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
CCLE_expression(genes = MAGE_genes,
type = c("lung", "skin", "bile_duct", "bladder",
"colorectal", "lymphoma", "uterine",
"myeloma", "kidney", "pancreatic", "brain",
"gastric", "breast", "bone", "head_and_neck",
"ovarian", "sarcoma", "leukemia", "esophageal",
"neuroblastoma"), units = "log_TPM")
normal_tissues_mean_methylation(MAGE_genes, na.omit = TRUE)
In CTexploreR, we there is single cell data from the testis, we thus can analyse CT genes expression during spermatogenesis.
There is only data for 137 of our 175 CT genes.
NB : SSC, Spermatogonia and early spermatocytes are premeiotic cells. Late spermatocytes (between both meiosis), round spermatid, elongated spermatid and sperm are postmeiotic cells.
genes_avail <-
CT_genes$external_gene_name[CT_genes$external_gene_name %in% unique(rownames(testis_sce))]
table(CT_genes$testis_cell_type)
##
## Early_spermatocyte Elongated_spermatid Late_spermatocyte Round_spermatid
## 31 6 11 24
## Sperm1 Sperm2 Spermatogonia SSC
## 6 5 22 23
table(CT_genes$testis_cell_type)/length(genes_avail)*100
##
## Early_spermatocyte Elongated_spermatid Late_spermatocyte Round_spermatid
## 22.627737 4.379562 8.029197 17.518248
## Sperm1 Sperm2 Spermatogonia SSC
## 4.379562 3.649635 16.058394 16.788321
55.4744526 % of genes are mainly expressed pre-meioticly.
germ_cells <- c("SSC", "Spermatogonia", "Early_spermatocyte",
"Late_spermatocyte","Round_spermatid", "Elongated_spermatid",
"Sperm1", "Sperm2")
somatic_cells <- c("Macrophage", "Endothelial", "Myoid", "Sertoli", "Leydig")
testis_sce_CT <- testis_sce[genes_avail, ]
mat <- SingleCellExperiment::logcounts(testis_sce_CT)
df_col <- data.frame(clusters = colData(testis_sce_CT)$clusters,
type = colData(testis_sce_CT)$type,
Donor = colData(testis_sce_CT)$Donor)
rownames(df_col) <- colnames(testis_sce_CT)
df_col <- df_col[order(df_col$type),]
df_col$lineage <- "Germ cells"
df_col$lineage[df_col$type %in% somatic_cells] <- "Somatic cells"
column_ha_type = HeatmapAnnotation(
type = df_col$type,
border = TRUE,
col = list(type = c("SSC" = "floralwhite", "Spermatogonia" = "moccasin",
"Early_spermatocyte" = "gold",
"Late_spermatocyte" = "orange",
"Round_spermatid" = "red2",
"Elongated_spermatid" = "darkred",
"Sperm1" = "violet", "Sperm2" = "purple",
"Sertoli" = "gray",
"Leydig" = "cadetblue2", "Myoid" = "springgreen3",
"Macrophage" = "gray10",
"Endothelial" = "steelblue")),
annotation_name_gp = gpar(fontsize = 8),
annotation_legend_param = legends_param)
column_ha_lineage = HeatmapAnnotation(
lineage = df_col$lineage,
border = TRUE,
col = list(lineage = c("Germ cells" = "salmon", "Somatic cells" = "cyan4")),
annotation_name_gp = gpar(fontsize = 8),
annotation_legend_param = legends_param)
scale_lims <- c(0, quantile(rowMax(mat), 0.75))
top_annot <- c(column_ha_lineage, column_ha_type)
# Until here is what's in the function, hereunder is my addition/change in Heatmap()
CT_genes_avail <- filter(CT_genes, external_gene_name %in% genes_avail)
chr_mat <- as.matrix(CT_genes_avail$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes_avail$external_gene_name
row_ha_chr <- rowAnnotation(chr = chr_mat,
annotation_legend_param = legends_param,
simple_anno_size = unit(0.5, "cm"),
col = list(chr = chr_colors),
annotation_name_gp = gpar(fontsize = 8),
annotation_name_side = "top")
regulation_mat <- as.matrix(CT_genes_avail$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
"Not methylation")
rownames(regulation_mat) <- CT_genes_avail$external_gene_name
row_ha_reg <- rowAnnotation(regulation = regulation_mat,
annotation_legend_param = legends_param,
simple_anno_size = unit(0.5, "cm"),
col = list(regulation = meth_colors),
annotation_name_gp = gpar(fontsize = 8),
annotation_name_side = "top")
left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))
split <- data.frame(CT_genes_avail$regulated_by_methylation, CT_genes_avail$X_linked)
Heatmap(mat[genes_avail, rownames(df_col), drop = FALSE],
name = "logCounts",
column_title = "Expression in testis cells (scRNAseq)",
column_split = df_col$type,
row_split = split,
row_title_gp = gpar(fontsize = 0),
show_column_names = FALSE,
show_column_dend = FALSE,
clustering_method_rows = "ward.D",
clustering_method_columns = "ward.D",
cluster_rows = TRUE,
cluster_columns = FALSE,
show_row_dend = FALSE,
row_names_gp = gpar(fontsize = 4),
col = colorRamp2(seq(scale_lims[1], scale_lims[2], length = 11),
legend_colors),
top_annotation = top_annot,
left_annotation = left_annot,
heatmap_legend_param = legends_param)
We used these data to determine in which testis cell type each gene is mostly expressed.
CT_genes$main_cell_type_expression <- factor(CT_genes$testis_cell_type,
levels = c("SSC",
"Spermatogonia",
"Early_spermatocyte",
"Late_spermatocyte",
"Round_spermatid",
"Elongated_spermatid",
"Sperm1",
"Sperm2",
"Sertoli"))
CT_genes %>%
mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
"Regulated by methylation",
"Not regulated by methylation")) %>%
filter(!is.na(testis_cell_type)) %>%
filter(main_cell_type_expression != "Sertoli") %>%
ggplot(aes(x = chr, fill = main_cell_type_expression,
color = main_cell_type_expression)) +
scale_fill_manual(values = c("lightyellow2", "moccasin", "gold", "orange","red2",
"darkred", "violet", "purple")) +
scale_color_manual(values = c("lightyellow3", "navajowhite2", "gold", "orange","red2",
"darkred", "violet", "purple")) +
geom_bar(stat = 'count') +
facet_grid(main_cell_type_expression ~ regulated_by_methylation) +
xlab("Chromosome") +
ylab("Number of genes") +
theme_bw() +
theme(
axis.text.x = element_text(color = "black", angle = 90, vjust = 0.5, size = 10),
axis.title = element_text(size = 10, color = "gray10"),
strip.text.y = element_blank())
CT_genes %>%
filter(X_linked) %>%
filter(testis_cell_type %in% c("Late_spermatocyte","Round_spermatid",
"Elongated_spermatid", "Sperm1", "Sperm2"))
These genes are on the X chromosome but escape X chromosome inactivation and are activates post-meioticly.
All CT genes function
msigdbr(species = "Homo sapiens" , category = "C5") %>%
filter(gene_symbol %in% CT_genes$external_gene_name) %>%
pull(gene_symbol) %>%
unique() %>%
length()
## [1] 106
msigdbr(species = "Homo sapiens" , category = "H") %>%
filter(gene_symbol %in% CT_genes$external_gene_name) %>%
pull(gene_symbol) %>%
unique() %>%
length()
## [1] 7
go_ora <- enrichGO(gene = CT_genes$ensembl_gene_id,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "all",
readable = TRUE)
as_tibble(go_ora)
as_tibble(go_ora) %>%
arrange(desc(Count)) %>%
head(12) %>%
mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/177,
ONTOLOGY == "CC"~ Count/197,
ONTOLOGY == "MF"~ Count/186)) %>%
ggplot(aes(x = Ratio, y = Description, fill = Description)) +
geom_col() +
theme_bw() +
ylab("GO term") +
xlab("Gene Ratio") +
theme(axis.text.y = element_blank(),
legend.position = "none",
axis.ticks.y = element_blank(),
axis.title = element_text(size = 10, color = "gray10"))+
geom_text(aes(0, y = Description, label = Description),
hjust = 0,
nudge_x = 0.005,
colour = "floralwhite",
size = 4)
ora_to_plot <- as_tibble(simplify(go_ora))
ora_to_plot <- ora_to_plot %>%
arrange(desc(Count)) %>%
head(9) %>%
mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/102,
ONTOLOGY == "MF"~ Count/186))
ora_to_plot %>%
ggplot(aes(x = Ratio, y = Description, fill = Description)) +
geom_col() +
theme_bw() +
ylab("GO term") +
xlab("Gene Ratio") +
theme(axis.text.y = element_blank(),
legend.position = "none",
axis.ticks.y = element_blank(),
axis.title = element_text(size = 10, color = "gray10"))+
geom_text(aes(0, y = Description, label = Description),
hjust = 0,
nudge_x = 0.005,
colour = "floralwhite",
size = 3.7)
As we can see here, most of genes are indeed linked to functions from reproduction. I represented here the 12 categories with the most genes, all from biological processes. However, they are enriched in 54 different GO terms.
Is there a difference between meth reg or not ?
go_ora_meth <- enrichGO(gene =
filter(CT_genes, regulated_by_methylation)$ensembl_gene_id,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "all",
readable = TRUE)
go_ora_meth <- simplify(go_ora_meth)
as_tibble(go_ora_meth)
go_ora_not_meth <- enrichGO(gene =
filter(CT_genes, !regulated_by_methylation)$ensembl_gene_id,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "all",
readable = TRUE)
go_ora_not_meth <- simplify(go_ora_not_meth)
as_tibble(go_ora_not_meth)
go_ora_meth_plot <- as_tibble(go_ora_meth) %>%
arrange(desc(Count)) %>%
head(10) %>%
mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/73,
ONTOLOGY == "MF"~ Count/80,
ONTOLOGY == "CC"~ Count/78)) %>%
mutate(regulation = "Methylation")
go_ora_not_meth_plot <- as_tibble(go_ora_not_meth) %>%
arrange(desc(Count)) %>%
head(8) %>%
mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/29,
ONTOLOGY == "MF"~ Count/30))%>%
mutate(regulation = "Not methylation")
rbind(go_ora_meth_plot, go_ora_not_meth_plot) %>%
ggplot(aes(x = Ratio, y = Description, fill = Description)) +
geom_col() +
theme_bw() +
ylab("GO term") +
xlab("Gene Ratio") +
facet_wrap(~ regulation) +
theme(legend.position = "none",
axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 6, color = "gray10"),
axis.title.x = element_text(size = 10, color = "gray10"),
axis.title.y = element_blank())+
geom_text(aes(0, y = Description, label = ID),
hjust = 0,
nudge_x = 0.005,
colour = "floralwhite",
size = 2)
We’ve also added a column tumor suppressor and oncogene in CTexploreR. These information come from Cancermine.
table(CT_genes$oncogene)
##
## oncogene
## 23
table(CT_genes$tumor_suppressor)
##
## tumor_suppressor
## 6
table(CT_genes$oncogene, CT_genes$tumor_suppressor)
##
## tumor_suppressor
## oncogene 4
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] Biostrings_2.70.3 XVector_0.42.0
## [3] patchwork_1.2.0 BiocParallel_1.36.0
## [5] DOSE_3.28.2 msigdbr_7.5.1
## [7] clusterProfiler_4.10.1 org.Hs.eg.db_3.18.0
## [9] AnnotationDbi_1.64.1 SingleCellExperiment_1.24.0
## [11] circlize_0.4.16 ComplexHeatmap_2.18.0
## [13] UpSetR_1.4.0 SummarizedExperiment_1.32.0
## [15] Biobase_2.62.0 GenomicRanges_1.54.1
## [17] GenomeInfoDb_1.38.7 IRanges_2.36.0
## [19] S4Vectors_0.40.2 MatrixGenerics_1.14.0
## [21] matrixStats_1.2.0 lubridate_1.9.3
## [23] forcats_1.0.0 stringr_1.5.1
## [25] dplyr_1.1.4 purrr_1.0.2
## [27] tidyr_1.3.1 tibble_3.2.1
## [29] ggplot2_3.5.0 tidyverse_2.0.0
## [31] biomaRt_2.58.2 Vennerable_3.0
## [33] xtable_1.8-4 gtools_3.9.5
## [35] reshape_0.8.9 RColorBrewer_1.1-3
## [37] lattice_0.22-5 RBGL_1.78.0
## [39] graph_1.80.0 BiocGenerics_0.48.1
## [41] CTexploreR_0.99.5 CTdata_1.2.0
## [43] readr_2.1.5 readxl_1.4.3
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.2 later_1.3.2
## [3] ggplotify_0.1.2 bitops_1.0-7
## [5] filelock_1.0.3 cellranger_1.1.0
## [7] polyclip_1.10-6 XML_3.99-0.16.1
## [9] lifecycle_1.0.4 doParallel_1.0.17
## [11] vroom_1.6.5 MASS_7.3-60.0.1
## [13] magrittr_2.0.3 sass_0.4.9
## [15] rmarkdown_2.26 jquerylib_0.1.4
## [17] yaml_2.3.8 httpuv_1.6.14
## [19] cowplot_1.1.3 DBI_1.2.2
## [21] abind_1.4-5 zlibbioc_1.48.2
## [23] ggraph_2.2.1 RCurl_1.98-1.14
## [25] yulab.utils_0.1.4 tweenr_2.0.3
## [27] rappdirs_0.3.3 GenomeInfoDbData_1.2.11
## [29] enrichplot_1.22.0 ggrepel_0.9.5
## [31] tidytree_0.4.6 codetools_0.2-19
## [33] DelayedArray_0.28.0 xml2_1.3.6
## [35] ggforce_0.4.2 tidyselect_1.2.1
## [37] shape_1.4.6.1 aplot_0.2.2
## [39] farver_2.1.1 viridis_0.6.5
## [41] BiocFileCache_2.10.1 jsonlite_1.8.8
## [43] GetoptLong_1.0.5 ellipsis_0.3.2
## [45] tidygraph_1.3.1 iterators_1.0.14
## [47] foreach_1.5.2 tools_4.3.2
## [49] progress_1.2.3 treeio_1.26.0
## [51] Rcpp_1.0.12 glue_1.7.0
## [53] gridExtra_2.3 SparseArray_1.2.4
## [55] xfun_0.42 qvalue_2.34.0
## [57] withr_3.0.0 BiocManager_1.30.22
## [59] fastmap_1.1.1 fansi_1.0.6
## [61] digest_0.6.35 gridGraphics_0.5-1
## [63] timechange_0.3.0 R6_2.5.1
## [65] mime_0.12 colorspace_2.1-0
## [67] Cairo_1.6-2 GO.db_3.18.0
## [69] RSQLite_2.3.5 utf8_1.2.4
## [71] generics_0.1.3 data.table_1.15.2
## [73] prettyunits_1.2.0 graphlayouts_1.1.1
## [75] httr_1.4.7 S4Arrays_1.2.1
## [77] scatterpie_0.2.1 pkgconfig_2.0.3
## [79] gtable_0.3.4 blob_1.2.4
## [81] shadowtext_0.1.3 htmltools_0.5.7
## [83] fgsea_1.28.0 clue_0.3-65
## [85] scales_1.3.0 png_0.1-8
## [87] ggfun_0.1.4 knitr_1.45
## [89] rstudioapi_0.15.0 tzdb_0.4.0
## [91] reshape2_1.4.4 rjson_0.2.21
## [93] nlme_3.1-164 curl_5.2.1
## [95] cachem_1.0.8 GlobalOptions_0.1.2
## [97] BiocVersion_3.18.1 parallel_4.3.2
## [99] HDO.db_0.99.1 pillar_1.9.0
## [101] vctrs_0.6.5 promises_1.2.1
## [103] dbplyr_2.5.0 cluster_2.1.6
## [105] evaluate_0.23 magick_2.8.3
## [107] cli_3.6.2 compiler_4.3.2
## [109] rlang_1.1.3 crayon_1.5.2
## [111] labeling_0.4.3 plyr_1.8.9
## [113] fs_1.6.3 stringi_1.8.3
## [115] viridisLite_0.4.2 babelgene_22.9
## [117] munsell_0.5.0 lazyeval_0.2.2
## [119] GOSemSim_2.28.1 Matrix_1.6-5
## [121] ExperimentHub_2.10.0 hms_1.1.3
## [123] bit64_4.0.5 KEGGREST_1.42.0
## [125] shiny_1.8.0 highr_0.10
## [127] interactiveDisplayBase_1.40.0 AnnotationHub_3.10.0
## [129] igraph_2.0.3 memoise_2.0.1
## [131] bslib_0.6.1 ggtree_3.10.1
## [133] fastmatch_1.1-4 bit_4.0.5
## [135] gson_0.1.0 ape_5.7-1